In [2]:
import pandas as pd
import numpy as np
import plotly.express as px 
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
In [4]:
data=pd.read_csv('C:/Users/Rakesh/Datasets/BRCA.csv')
In [5]:
data.head()
Out[5]:
Patient_ID Age Gender Protein1 Protein2 Protein3 Protein4 Tumour_Stage Histology ER status PR status HER2 status Surgery_type Date_of_Surgery Date_of_Last_Visit Patient_Status
0 TCGA-D8-A1XD 36.0 FEMALE 0.080353 0.42638 0.54715 0.273680 III Infiltrating Ductal Carcinoma Positive Positive Negative Modified Radical Mastectomy 15-Jan-17 19-Jun-17 Alive
1 TCGA-EW-A1OX 43.0 FEMALE -0.420320 0.57807 0.61447 -0.031505 II Mucinous Carcinoma Positive Positive Negative Lumpectomy 26-Apr-17 09-Nov-18 Dead
2 TCGA-A8-A079 69.0 FEMALE 0.213980 1.31140 -0.32747 -0.234260 III Infiltrating Ductal Carcinoma Positive Positive Negative Other 08-Sep-17 09-Jun-18 Alive
3 TCGA-D8-A1XR 56.0 FEMALE 0.345090 -0.21147 -0.19304 0.124270 II Infiltrating Ductal Carcinoma Positive Positive Negative Modified Radical Mastectomy 25-Jan-17 12-Jul-17 Alive
4 TCGA-BH-A0BF 56.0 FEMALE 0.221550 1.90680 0.52045 -0.311990 II Infiltrating Ductal Carcinoma Positive Positive Negative Other 06-May-17 27-Jun-19 Dead
In [6]:
data.isnull().sum()
Out[6]:
Patient_ID             7
Age                    7
Gender                 7
Protein1               7
Protein2               7
Protein3               7
Protein4               7
Tumour_Stage           7
Histology              7
ER status              7
PR status              7
HER2 status            7
Surgery_type           7
Date_of_Surgery        7
Date_of_Last_Visit    24
Patient_Status        20
dtype: int64
In [7]:
data=data.dropna()
In [8]:
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 317 entries, 0 to 333
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Patient_ID          317 non-null    object 
 1   Age                 317 non-null    float64
 2   Gender              317 non-null    object 
 3   Protein1            317 non-null    float64
 4   Protein2            317 non-null    float64
 5   Protein3            317 non-null    float64
 6   Protein4            317 non-null    float64
 7   Tumour_Stage        317 non-null    object 
 8   Histology           317 non-null    object 
 9   ER status           317 non-null    object 
 10  PR status           317 non-null    object 
 11  HER2 status         317 non-null    object 
 12  Surgery_type        317 non-null    object 
 13  Date_of_Surgery     317 non-null    object 
 14  Date_of_Last_Visit  317 non-null    object 
 15  Patient_Status      317 non-null    object 
dtypes: float64(5), object(11)
memory usage: 42.1+ KB
In [9]:
data.describe()
Out[9]:
Age Protein1 Protein2 Protein3 Protein4
count 317.000000 317.000000 317.000000 317.000000 317.000000
mean 58.725552 -0.027232 0.949557 -0.095104 0.006713
std 12.827374 0.543858 0.906153 0.589027 0.625965
min 29.000000 -2.144600 -0.978730 -1.627400 -2.025500
25% 49.000000 -0.350600 0.368840 -0.531360 -0.382240
50% 58.000000 0.005649 0.997130 -0.193040 0.038522
75% 67.000000 0.336260 1.612000 0.251210 0.436250
max 90.000000 1.593600 3.402200 2.193400 1.629900
In [10]:
stage=data['Tumour_Stage'].value_counts()
transactions = stage.index
quantity = stage.values

figure = px.pie(data, values=quantity, names=transactions, hole=0.5, title='Tumour Stages of Patients')
figure.show()
In [11]:
# Tumour Stage
histology = data["Histology"].value_counts()
transactions = histology.index
quantity = histology.values

figure = px.pie(data, 
             values=quantity, 
             names=transactions,hole = 0.5, 
             title="Histology of Patients")
figure.show()
In [12]:
data['ER status'].value_counts()
Out[12]:
Positive    317
Name: ER status, dtype: int64
In [13]:
data['PR status'].value_counts()
Out[13]:
Positive    317
Name: PR status, dtype: int64
In [14]:
data['HER2 status'].value_counts()
Out[14]:
Negative    288
Positive     29
Name: HER2 status, dtype: int64
In [17]:
surgery = data["Surgery_type"].value_counts()
transactions = surgery.index
quantity = surgery.values

figure = px.pie(data, 
             values=quantity, 
             names=transactions,hole = 0.5, 
             title="Types of Surgery for Patients")
figure.show()
In [18]:
data['Tumour_Stage'] = data['Tumour_Stage'].map({'I':1,'II':2,'III':3})
data['Histology'] = data['Histology'].map({'Infiltrating Ductal Carcinoma':1, 'Infiltrating Lobular Carcinoma':2,"Mucinous Carcinoma": 3})
In [19]:
data['ER status'] = data['ER status'].map({'Positive':1})
data['PR status'] = data['PR status'].map({'Positive':1})
In [21]:
data['HER2 status'] = data['HER2 status'].map({'Positive': 1,'Negative': 2})
In [22]:
data['Gender'] = data['Gender'].map({'MALE':0, 'FEMALE':1})
data['Surgery_type'] = data['Surgery_type'].map({'Other':1, 'Modified Radical Mastectomy': 2, 'Lumpectomy':3, 'Simple Mastectomy':4})
In [23]:
data.head()
Out[23]:
Patient_ID Age Gender Protein1 Protein2 Protein3 Protein4 Tumour_Stage Histology ER status PR status HER2 status Surgery_type Date_of_Surgery Date_of_Last_Visit Patient_Status
0 TCGA-D8-A1XD 36.0 1 0.080353 0.42638 0.54715 0.273680 3 1 1 1 2 2 15-Jan-17 19-Jun-17 Alive
1 TCGA-EW-A1OX 43.0 1 -0.420320 0.57807 0.61447 -0.031505 2 3 1 1 2 3 26-Apr-17 09-Nov-18 Dead
2 TCGA-A8-A079 69.0 1 0.213980 1.31140 -0.32747 -0.234260 3 1 1 1 2 1 08-Sep-17 09-Jun-18 Alive
3 TCGA-D8-A1XR 56.0 1 0.345090 -0.21147 -0.19304 0.124270 2 1 1 1 2 2 25-Jan-17 12-Jul-17 Alive
4 TCGA-BH-A0BF 56.0 1 0.221550 1.90680 0.52045 -0.311990 2 1 1 1 2 1 06-May-17 27-Jun-19 Dead
In [28]:
x=np.array(data[['Age', 'Gender', 'Protein1', 'Protein2', 'Protein3','Protein4', 
                   'Tumour_Stage', 'Histology', 'ER status', 'PR status', 
                   'HER2 status', 'Surgery_type']])
y=np.array(data[['Patient_Status']])
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.1, random_state=42)
In [29]:
model= SVC()
In [30]:
model.fit(xtrain, ytrain)
C:\Users\Rakesh\Downloads\Anaconda\lib\site-packages\sklearn\utils\validation.py:993: DataConversionWarning:

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().

Out[30]:
SVC()
In [31]:
features = np.array([[36.0, 1, 0.080353, 0.42638, 0.54715, 0.273680, 3, 1, 1, 1, 2, 2,]])
print(model.predict(features))
['Alive']